# Suppress all warnings globally. Any negative value of `warn` means
# "ignore warnings"; -1 is the documented form (the original used -2).
# NOTE(review): globally silencing warnings can hide real problems --
# consider scoping suppression to the specific noisy calls instead.
options(warn = -1)

library(pacman)
p_load(dplyr)

# Generic import() helper used to load the project's shared scripts.
source('../lib/import.R')
#
# Library of common functions developed over the course of this
# assignment (TP).
import('../lib/common-lib.R')
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './csv.R' script loadded successfuly!"
## [1] "-> './plot.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './importance.R' script loadded successfuly!"
## [1] "-> './correlation.R' script loadded successfuly!"
## [1] "-> './pca.R' script loadded successfuly!"
## [1] "-> './set_split.R' script loadded successfuly!"
## [1] "-> './models.R' script loadded successfuly!"
## ---
## biotools version 4.1
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './test.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './balance.R' script loadded successfuly!"
## [1] "-> '../lib/common-lib.R' script loadded successfuly!"
#
# Helper functions specific to this assignment (TP).
import('../scripts/helper_functions.R')
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> '../lib/importance.R' script loadded successfuly!"
## [1] "-> '../scripts/helper_functions.R' script loadded successfuly!"
# Fix the RNG seed so sampling and splitting steps are reproducible.
set.seed(1)

# Load the NASA near-earth-objects dataset.
dataset <- loadcsv('../datasets/nasa.csv')
str(dataset)
## $ Neo.Reference.ID : int 3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
## $ Name : int 3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.KM.min. : num 0.1272 0.1461 0.2315 0.0088 0.1272 ...
## $ Est.Dia.in.KM.max. : num 0.2845 0.3266 0.5177 0.0197 0.2845 ...
## $ Est.Dia.in.M.min. : num 127.2 146.1 231.5 8.8 127.2 ...
## $ Est.Dia.in.M.max. : num 284.5 326.6 517.7 19.7 284.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Est.Dia.in.Feet.min. : num 417.4 479.2 759.5 28.9 417.4 ...
## $ Est.Dia.in.Feet.max. : num 933.3 1071.6 1698.3 64.6 933.3 ...
## $ Close.Approach.Date : chr "1995-01-01" "1995-01-01" "1995-01-08" "1995-01-15" ...
## $ Epoch.Date.Close.Approach : num 7.89e+11 7.89e+11 7.90e+11 7.90e+11 7.90e+11 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Orbiting.Body : chr "Earth" "Earth" "Earth" "Earth" ...
## $ Orbit.ID : int 17 21 22 7 25 40 43 22 100 30 ...
## $ Orbit.Determination.Date : chr "2017-04-06 08:36:37" "2017-04-06 08:32:49" "2017-04-06 09:20:19" "2017-04-06 09:15:49" ...
## $ Orbit.Uncertainity : int 5 3 0 6 1 1 1 0 0 0 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Perihelion.Time : num 2458162 2457795 2458120 2457902 2457814 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Equinox : chr "J2000" "J2000" "J2000" "J2000" ...
## $ Hazardous : chr "True" "False" "True" "False" ...
# Columns excluded from modeling: identifiers, dates, near-constant
# columns (Orbiting.Body, Equinox) and redundant unit conversions of
# the estimated diameter (the Miles columns are kept as representatives).
excluded_columns <- c(
  'Neo.Reference.ID',
  'Name',
  'Close.Approach.Date',
  'Epoch.Date.Close.Approach',
  'Orbit.ID',
  'Orbit.Determination.Date',
  'Orbiting.Body',
  'Est.Dia.in.Feet.min.',
  'Est.Dia.in.Feet.max.',
  'Est.Dia.in.M.min.',
  'Est.Dia.in.M.max.',
  'Est.Dia.in.KM.min.',
  'Est.Dia.in.KM.max.',
  'Equinox',
  'Orbit.Uncertainity'
)

# Drop the excluded columns and any row with missing values.
# Fix: wrap the external vector in all_of() -- selecting with a bare
# external vector is ambiguous and deprecated in tidyselect (the knit
# output itself printed the deprecation note).
ds_step_1 <- dataset %>%
  dplyr::select(-dplyr::all_of(excluded_columns)) %>%
  na.omit()
rm(dataset)
str(ds_step_1)## 'data.frame': 4687 obs. of 25 variables:
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458000 2458000 2458000 2458000 2458000 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Perihelion.Time : num 2458162 2457795 2458120 2457902 2457814 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Hazardous : chr "True" "False" "True" "False" ...
Excluimos las columnas que tienen una correlación mayor al 90% (cutoff = 0.9). De cada grupo altamente correlacionado nos quedamos con una sola variable, ya que todas son muy similares.
# Flag feature columns whose pairwise correlation exceeds the cutoff,
# so that only one representative of each highly correlated group of
# variables is kept.
high_correlated_columns <- find_high_correlated_columns(
  feat(ds_step_1),
  cutoff = 0.9
)
## Means: 0.285 vs 0.213 so flagging column 21
## Compare row 9 and column 7 with corr 1
## Means: 0.287 vs 0.206 so flagging column 9
## Compare row 7 and column 10 with corr 1
## Means: 0.253 vs 0.199 so flagging column 7
## Compare row 10 and column 8 with corr 1
## Means: 0.216 vs 0.196 so flagging column 10
## Compare row 12 and column 15 with corr 0.93
## Means: 0.271 vs 0.191 so flagging column 12
## Compare row 15 and column 18 with corr 0.995
## Means: 0.226 vs 0.184 so flagging column 15
## Compare row 4 and column 5 with corr 1
## Means: 0.29 vs 0.175 so flagging column 4
## Compare row 5 and column 6 with corr 1
## Means: 0.246 vs 0.163 so flagging column 5
## Compare row 3 and column 2 with corr 1
## Means: 0.219 vs 0.153 so flagging column 3
## Compare row 22 and column 13 with corr 0.978
## Means: 0.127 vs 0.151 so flagging column 13
## All correlations <= 0.9
# Drop the flagged highly correlated columns.
# NOTE(review): [-1] keeps the first flagged column as the group's
# representative -- confirm find_high_correlated_columns() returns
# indices ordered so this is the intended survivor.
ds_step_2 <- ds_step_1 %>%
  dplyr::select(-high_correlated_columns[-1])
rm(ds_step_1)
str(ds_step_2)
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Minimum.Orbit.Intersection: num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Perihelion.Time : num 2458162 2457795 2458120 2457902 2457814 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Hazardous : chr "True" "False" "True" "False" ...
Para realizar este paso vamos a utilizar la función feature_importance del algoritmo Random Forest. Esta función nos permite comparar las variables de acuerdo a cuán buenas son para separar las clases. Luego tomaremos las 5 variables que mejor separan las clases.
# Rank the features by how well they separate the target classes,
# using Random Forest variable importance.
result <- features_importance(ds_step_2, target = 'Hazardous')
## ℹ Use `all_of(target)` instead of `target` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
result##
## Call:
## randomForest(x = features, y = target, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 0.28%
## Confusion matrix:
## False True class.error
## False 3926 6 0.001525941
## True 7 748 0.009271523
plot_features_importance(result)

# Number of top features to keep, ranked by importance.
# Fix: use `<-` for assignment (the original used `=` fused onto the
# previous statement).
n_best_features <- 5
# n_best_features = 15

best_features <- top_acc_features(result, top = n_best_features)
best_features
length(best_features)

# Keep only the best features plus the target column.
# Fix: wrap the external vector in all_of(); `c(best_features,
# c(Hazardous))` was a roundabout (and deprecated) way to select.
ds_step_3 <- ds_step_2 %>%
  dplyr::select(dplyr::all_of(best_features), Hazardous)
rm(ds_step_2)
str(ds_step_3)
## $ Minimum.Orbit.Intersection: num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Hazardous : chr "True" "False" "True" "False" ...
# Remove multivariate outliers (method 1 kept for reference):
# ds_step_4 <- filter_outliers_m1(ds_step_3, max_score = 0.51)
ds_step_4 <- filter_outliers_m2(ds_step_3)
rm(ds_step_3)

# Encode the target numerically: 'True' -> 1, anything else -> 0.
ds_step_5 <- ds_step_4 %>%
  mutate(Hazardous = case_when(Hazardous %in% c('True') ~ 1, TRUE ~ 0))
rm(ds_step_4)
str(ds_step_5)## 'data.frame': 4493 obs. of 6 variables:
## $ Minimum.Orbit.Intersection: num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Hazardous : num 1 0 1 0 1 0 0 0 0 1 ...
# Box plots and histograms for the selected features.
# NOTE(review): "coparative_boxplot" looks like a typo of
# "comparative_boxplot", but it must match the helper's actual name --
# confirm against the helper library before renaming.
coparative_boxplot(feat(ds_step_5), to_col = n_best_features)
comparative_histplot(feat(ds_step_5), to_col = n_best_features)
## [1] 3.7e-24
##
## [[2]]
## [1] 3.7e-24
##
## [[3]]
## [1] 3.7e-24
##
## [[4]]
## [1] 3.7e-24
##
## [[5]]
## [1] 3.7e-24
Observaciones: Al parecer solo Perihelion.Distance parece ser normal, o por lo menos la más normal de todas.
uni_shapiro_test(feat(ds_step_5))## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.83111, p-value < 2.2e-16
##
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.9821, p-value < 2.2e-16
##
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.75483, p-value < 2.2e-16
##
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.98592, p-value < 2.2e-16
##
## [1] "=> Variable: Inclination"
## [1] ""
##
## Shapiro-Wilk normality test
##
## data: features[, col_index]
## W = 0.91502, p-value < 2.2e-16
Observaciones: En todos los casos el p-valor < 0.05 y se rechaza normalidad en todas las variables. Esto coincide con los qqplots, donde en todos los casos no son normales, salvo Perihelion.Distance que parece tender a la normalidad.
mult_shapiro_test(feat(ds_step_5))##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.71502, p-value < 2.2e-16
Observaciones: El p-valor < 0.05, por lo tanto se rechaza normalidad multivariada. Se corresponde con el resultado de los tests de Shapiro univariados y los qqplots.
multi_boxm_test(feat(ds_step_5), ds_step_5$Hazardous)##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: features
## Chi-Sq (approx.) = 3242.4, df = 15, p-value < 2.2e-16
Observaciones: El p-valor < 0.05, por lo tanto se rechaza la hipótesis nula y podemos decir que las variables no son homocedásticas.
plot_correlations(feat(ds_step_5))## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4493 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4493 individuals, described by 5 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
Se filtran los outliers para poder ver con más claridad el biplot.
plot_robust_pca(ds_step_5)

# Split into train/test sets. Observations are shuffled first so the
# classifiers cannot learn any sequence that may exist in the original
# ordering of the data.
c(raw_train_set, raw_test_set) %<-% train_test_split(
  ds_step_5,
  train_size = .8,
  shuffle = TRUE
)

# Class balancing via SMOTE (disabled -- the raw training set is used):
# balanced_train_set <- smote_balance(raw_train_set, raw_train_set$Hazardous)
# rm(raw_train_set)
balanced_train_set <- raw_train_set
# Standardize the features (z-score: subtract the mean, divide by the
# standard deviation), leaving the target column untouched.
# NOTE(review): the test set is scaled with its own mean/sd instead of
# the training set's statistics -- confirm this is intended, since it
# makes the train and test feature scales slightly inconsistent.
train_set <- balanced_train_set %>%
  mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
test_set <- raw_test_set %>%
  mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
rm(balanced_train_set)
rm(raw_test_set)

# Linear Discriminant Analysis.
lda_model <- lda(formula(Hazardous~.), train_set)
lda_test_pred <- predict(lda_model, feat(test_set))
lda_train_pred <- predict(lda_model, feat(train_set))
plot_cm(lda_test_pred$class, target(test_set))
plot_roc(lda_test_pred$class, target(test_set))
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 149
## Mean of positive(s): 1.691
## Variance of positive(s): 0.2149
## ===== Negative(s) =====
## Number of negative(s): 750
## Mean of negative(s): 1.029
## Variance of negative(s): 0.02851
## ===== AUC =====
## Area under curve: 0.831
##
## =========================
## FPR TPR
## 0.00000 0.0000
## 0.02933 0.6913
## 1.00000 1.0000
fbeta_score(prediction=lda_test_pred$class, reality=target(test_set), beta=2)## [1] " F2Score: 0.714285714285714"
lda_model$scaling## LD1
## Minimum.Orbit.Intersection -1.25901613
## Absolute.Magnitude -1.61036721
## Est.Dia.in.Miles.min. -0.32194894
## Perihelion.Distance 0.09444703
## Inclination -0.04451865
# Quadratic Discriminant Analysis (per-class covariance matrices).
qda_model <- qda(formula(Hazardous~.), train_set)
qda_test_pred <- predict(qda_model, feat(test_set))
qda_train_pred <- predict(qda_model, feat(train_set))
plot_cm(qda_test_pred$class, target(test_set))
plot_roc(qda_test_pred$class, target(test_set))
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 149
## Mean of positive(s): 1.94
## Variance of positive(s): 0.05714
## ===== Negative(s) =====
## Number of negative(s): 750
## Mean of negative(s): 1.019
## Variance of negative(s): 0.01834
## ===== AUC =====
## Area under curve: 0.9605
##
## =========================
## FPR TPR
## 0.00000 0.0000
## 0.01867 0.9396
## 1.00000 1.0000
fbeta_score(prediction=qda_test_pred$class, reality=target(test_set), beta=2)## [1] " F2Score: 0.933333333333333"
# Regularized Discriminant Analysis.
# NOTE(review): unlike lda/qda above, predict() here receives the full
# data frame (including the target) rather than feat(...) -- confirm
# this asymmetry is what the rda implementation in use expects.
rda_model <- rda(formula(Hazardous~.), train_set)
rda_test_pred <- predict(rda_model, test_set)
rda_train_pred <- predict(rda_model, train_set)
plot_cm(rda_test_pred$class, target(test_set))
plot_roc(rda_test_pred$class, target(test_set))
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 149
## Mean of positive(s): 1.94
## Variance of positive(s): 0.05714
## ===== Negative(s) =====
## Number of negative(s): 750
## Mean of negative(s): 1.019
## Variance of negative(s): 0.01834
## ===== AUC =====
## Area under curve: 0.9605
##
## =========================
## FPR TPR
## 0.00000 0.0000
## 0.01867 0.9396
## 1.00000 1.0000
fbeta_score(rda_test_pred$class, target(test_set), beta=2)## [1] " F2Score: 0.933333333333333"
# Logistic regression.
lr_model <- glm(formula(Hazardous~.), train_set, family=binomial)

# NOTE(review): predict.glm() without type = "response" returns values
# on the link (log-odds) scale, so this threshold is applied to logits,
# not probabilities (0.01 in log-odds is roughly 0.5025 probability) --
# confirm the threshold was chosen with that in mind.
lr_threshold <- 0.01
lr_test_pred <- predict(lr_model, test_set)
lr_test_pred_threshold <- ifelse(lr_test_pred >= lr_threshold, 1, 0)
lr_train_pred <- predict(lr_model, train_set)
lr_train_pred_threshold <- ifelse(lr_train_pred >= lr_threshold, 1, 0)
plot_cm(lr_test_pred_threshold, target(test_set))
plot_roc(lr_test_pred_threshold, target(test_set))
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 149
## Mean of positive(s): 0.8322
## Variance of positive(s): 0.1406
## ===== Negative(s) =====
## Number of negative(s): 750
## Mean of negative(s): 0.02
## Variance of negative(s): 0.01963
## ===== AUC =====
## Area under curve: 0.9061
##
## =========================
## FPR TPR
## 0.00 0.0000
## 0.02 0.8322
## 1.00 1.0000
fbeta_score(lr_test_pred_threshold, target(test_set), beta=2)## [1] " F2Score: 0.843537414965986"
# Support Vector Machine with an RBF kernel.
# NOTE(review): Hazardous is numeric (0/1) at this point, so svm()
# appears to run in regression mode and its continuous output is
# thresholded into class labels below -- confirm this is intended.
svm_model <- svm(formula(Hazardous~.), train_set, kernel = 'radial')
svm_threshold <- 0.20
svm_test_pred <- predict(svm_model, test_set)
svm_test_pred_threshold <- ifelse(svm_test_pred >= svm_threshold, 1, 0)
svm_train_pred <- predict(svm_model, train_set)
svm_train_pred_threshold <- ifelse(svm_train_pred >= svm_threshold, 1, 0)
plot_cm(svm_test_pred_threshold, target(test_set))
plot_roc(svm_test_pred_threshold, target(test_set))
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 149
## Mean of positive(s): 0.9866
## Variance of positive(s): 0.01333
## ===== Negative(s) =====
## Number of negative(s): 750
## Mean of negative(s): 0.06933
## Variance of negative(s): 0.06461
## ===== AUC =====
## Area under curve: 0.9586
##
## =========================
## FPR TPR
## 0.00000 0.0000
## 0.06933 0.9866
## 1.00000 1.0000
fbeta_score(svm_test_pred_threshold, target(test_set), beta=2)## [1] " F2Score: 0.924528301886793"
# Gradient boosting (XGBoost) on the raw feature matrix.
# NOTE(review): 15000 rounds with max_depth = 20 is an extremely large
# model for ~3.6k training rows and invites overfitting -- confirm
# these hyperparameters were chosen deliberately (e.g. via validation).
xgboost_model <- xgboost(
  as.matrix(feat(train_set)),
  target(train_set),
  eta = 0.2,
  max_depth = 20,
  nround = 15000,
  eval_metric = 'logloss',
  objective = "binary:logistic",
  nthread = 24,
  verbose = 0
)

# Threshold the predicted probabilities into class labels.
xgb_threshold <- 0.1
xgb_test_pred <- predict(xgboost_model, as.matrix(feat(test_set)))
xgb_test_pred_threshold <- ifelse(xgb_test_pred >= xgb_threshold, 1, 0)
xgb_train_pred <- predict(xgboost_model, as.matrix(feat(train_set)))
xgb_train_pred_threshold <- ifelse(xgb_train_pred >= xgb_threshold, 1, 0)
plot_cm(xgb_test_pred_threshold, target(test_set))
plot_roc(xgb_test_pred_threshold, target(test_set))
## Method used: empirical
## ===== Positive(s) =====
## Number of positive(s): 149
## Mean of positive(s): 0.9732
## Variance of positive(s): 0.0263
## ===== Negative(s) =====
## Number of negative(s): 750
## Mean of negative(s): 0.009333
## Variance of negative(s): 0.009259
## ===== AUC =====
## Area under curve: 0.9819
##
## =========================
## FPR TPR
## 0.000000 0.0000
## 0.009333 0.9732
## 1.000000 1.0000
fbeta_score(xgb_test_pred_threshold, target(test_set), beta=2)## [1] " F2Score: 0.969251336898396"
# Model comparison: F2 score (recall-weighted F-measure) on test and
# train sets, the absolute train/test gap (an overfitting indicator),
# and the decision threshold ("Umbral") used by each model.
data.frame(
  Test.F2Score.Percent = c(
    fbeta_score(lda_test_pred$class, target(test_set), beta = 2, show = FALSE),
    fbeta_score(qda_test_pred$class, target(test_set), beta = 2, show = FALSE),
    fbeta_score(rda_test_pred$class, target(test_set), beta = 2, show = FALSE),
    fbeta_score(lr_test_pred_threshold, target(test_set), beta = 2, show = FALSE),
    fbeta_score(svm_test_pred_threshold, target(test_set), beta = 2, show = FALSE),
    fbeta_score(xgb_test_pred_threshold, target(test_set), beta = 2, show = FALSE)
  ),
  Train.F2Score.Percent = c(
    fbeta_score(lda_train_pred$class, target(train_set), beta = 2, show = FALSE),
    fbeta_score(qda_train_pred$class, target(train_set), beta = 2, show = FALSE),
    fbeta_score(rda_train_pred$class, target(train_set), beta = 2, show = FALSE),
    fbeta_score(lr_train_pred_threshold, target(train_set), beta = 2, show = FALSE),
    fbeta_score(svm_train_pred_threshold, target(train_set), beta = 2, show = FALSE),
    fbeta_score(xgb_train_pred_threshold, target(train_set), beta = 2, show = FALSE)
  ),
  Model = c(
    'LDA',
    'QDA',
    'RDA',
    'Regresion Logistica',
    'SVM',
    'XGBoost'
  ),
  Umbral = c(
    0.5,
    0.5,
    0.5,
    lr_threshold,
    svm_threshold,
    xgb_threshold
  )
) %>%
  arrange(desc(Test.F2Score.Percent)) %>%
  mutate(Diff.F2Score.Percent = abs(Test.F2Score.Percent - Train.F2Score.Percent)) %>%
  mutate(
    Test.F2Score.Percent = round(Test.F2Score.Percent * 100, 3),
    Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3),
    # Fix: express the gap in percent as well -- it was previously left
    # on the 0-1 scale while the columns it is compared against had
    # already been multiplied by 100, despite its "Percent" name.
    Diff.F2Score.Percent = round(Diff.F2Score.Percent * 100, 3)
  ) %>%
  dplyr::select(
    Test.F2Score.Percent,
    Train.F2Score.Percent,
    Diff.F2Score.Percent,
    Model,
    Umbral
  )
Típica con estimadores de la normal.
# Standard scaling with normal estimators (z-score on every column).
# Fix: mutate() with a bare formula does not apply the transformation
# to any column; mutate_all() applies the z-score to every feature,
# matching the mutate_at() pattern used for the train/test sets above.
scaled_data_1 <- feat(ds_step_5) %>%
  mutate_all(~(scale(.) %>% as.vector))
clustering_metrics_plot(scaled_data_1)
# Min-max scaling: map every feature onto [0, 1].
scaled_data_2 <- apply(
  feat(ds_step_5),
  2,
  function(x) { (x - min(x)) / (max(x) - min(x)) }
)
clustering_metrics_plot(scaled_data_2)

# K-means clustering on the z-scored features.
n_clusters <- 2
kmeans_model <- kmeans(scaled_data_1, n_clusters)

# Attach the cluster assignment to a copy of the dataset.
km_ds_step_5 <- data.frame(ds_step_5)
km_ds_step_5$kmeans <- kmeans_model$cluster

# Alternative outlier filter (kept for reference):
# ds_without_outliers <- filter_outliers_m1(km_ds_step_5, max_score = 0.5)
ds_without_outliers <- filter_outliers_m2(km_ds_step_5)

# Robust PCA biplot coloured by k-means cluster.
# Fix: removed the trailing comma after the groups argument -- it left
# an empty (missing) trailing argument in the call.
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-kmeans),
  groups = factor(ds_without_outliers$kmeans)
)

# Euclidean distance matrix for hierarchical clustering.
# NOTE(review): ds_step_5 still contains the numeric Hazardous target
# and the features are unscaled here -- confirm both are intended.
mat_dist <- dist(x = ds_step_5, method = "euclidean")

# Dendrograms for several linkage strategies.
hc_complete <- hclust(d = mat_dist, method = "complete")
hc_average <- hclust(d = mat_dist, method = "average")
hc_single <- hclust(d = mat_dist, method = "single")
hc_ward <- hclust(d = mat_dist, method = "ward.D2")
Calculo del coeficiente de correlación cofenético
# Cophenetic correlation per linkage: agreement between the original
# distances and the distances implied by each dendrogram (higher is
# better).
cor(x = mat_dist, cophenetic(hc_complete))
cor(x = mat_dist, cophenetic(hc_average))
cor(x = mat_dist, cophenetic(hc_single))
cor(x = mat_dist, cophenetic(hc_ward))
Construcción de un dendrograma usando los resultados de la técnica de Ward
# Cut the Ward dendrogram into n_clusters groups.
n_clusters <- 2
graphics.off()
plot(hc_ward)
rect.hclust(hc_ward, k = n_clusters, border = "red")

# Attach the hierarchical (Ward) cluster assignment to a copy of the
# dataset.
hc_ds <- data.frame(ds_step_5)
hc_ds$her_ward <- cutree(hc_ward, k = n_clusters)

# Alternative outlier filter (kept for reference; the commented call
# said "filter_outlier_m1", likely a typo of filter_outliers_m1):
# ds_without_outliers <- filter_outliers_m1(hc_ds, max_score = 0.57)
ds_without_outliers <- filter_outliers_m2(hc_ds)

# Robust PCA biplot coloured by Ward cluster.
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-her_ward),
  groups = factor(ds_without_outliers$her_ward),
  colours = c("orange","cyan","blue","magenta","yellow","black"),
  labels = c("grupo 1", "grupo 2","grupo 3","grupo 4","grupo 5","grupo 6")
)
Realizado por Adrian Marino
adrianmarino@gmail.com